#!/usr/bin/env bash

# Show the processes of a single job on all of the job's nodes (a "ps" listing
# that leaves out system processes).
# The usage text is exported under the name "usage".
usage="$0 [-c columns | -h] jobid"
export usage

# Author: Ole.H.Nielsen@fysik.dtu.dk
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/

# Requires ClusterShell with configuration for parallel commands on Slurm jobs,
# see https://wiki.fysik.dtu.dk/niflheim/SLURM#clustershell
# and https://clustershell.readthedocs.io/en/latest/intro.html

# Default width of the ps output lines (can be changed with -c):
export columns=100

#######################################
# Check a value given to the -c option.
# Arguments: $1 - requested number of output columns
# Outputs:   an explanatory message on stdout when the value is rejected
# Returns:   0 if $1 is a decimal integer between 5 and 1024, otherwise 1
#######################################
check_columns() {
	local requested=$1 digits
	if ! [[ $requested =~ ^[0-9]+$ ]]
	then
		echo "Number of columns $requested must be a positive number"
		return 1
	fi
	# Remove leading zeros before comparing: bash arithmetic would otherwise
	# read "010" as octal 8 and fail outright on "08".
	digits=$requested
	while [[ $digits == 0?* ]]
	do
		digits=${digits#0}
	done
	# More than 4 significant digits is always out of range; testing the length
	# first keeps very long digit strings away from integer arithmetic.
	if (( ${#digits} > 4 || digits < 5 || digits > 1024 ))
	then
		echo "Unreasonable number of columns $requested"
		return 1
	fi
	return 0
}

# Parse command options
while getopts "c:h" options; do
	case $options in
		c )	check_columns "$OPTARG" || exit 1
			# Store the plain decimal value (without leading zeros)
			export columns=$((10#$OPTARG))
			;;
		h | * ) echo "Usage: $usage"
			exit 1;;
	esac
done
# Drop the parsed options so that $1 is the first non-option argument
shift $((OPTIND-1))

# Absolute paths of the external commands used below
CLUSH=/usr/bin/clush
PS=/bin/ps
# ps options: first the list of output fields, then the output width
# taken from $columns
PSFLAGS="-o pid,nlwp,state,user,start,cputime,%cpu,rssize,command"
PSFLAGS+=" --columns $columns"

# Exactly one positional argument (the jobid) must be left at this point.
if [[ $# -ne 1 ]]
then
	if [[ $# -eq 0 ]]
	then
		echo "ERROR: No jobid given"
	else
		echo "ERROR: Only one jobid may be given"
	fi
	echo "Usage: $usage"
	# Exit with status 1 like the other error paths
	exit 1
fi
# Check that jobid is a number, optionally followed by _ and an arraytaskid.
# Both parts need at least one digit, so "_" or "12_" are rejected here.
if ! [[ $1 =~ ^[0-9]+(_[0-9]+)?$ ]]
then
	echo "ERROR: <jobid> must be a number or jobid_arraytaskid"
	echo "Usage: $usage"
	exit 1
fi

jobid=$1
# Check if this jobid can be inquired successfully.
# The exit status of squeue is tested directly on the assignment.
if ! JOBSTATE=$(squeue -h -O State:. -j "$jobid")
then
	echo "Error inquiring about job $jobid"
	exit 1
fi

# Detect job arrays by counting number of words in JOBSTATE.
# read -a splits on whitespace (including newlines) without the pathname
# expansion that an unquoted $JOBSTATE would undergo.
read -r -d '' -a words <<<"$JOBSTATE"
# Numeric comparison of the word count
if (( ${#words[@]} > 1 ))
then
	echo "ERROR: The job $jobid is a job array with multiple jobs. Please select only one of the array jobs:"
	squeue -O JobArrayID,ArrayJobID,ArrayTaskID,JobID,StartTime,TimeUsed,TimeLimit -j "$jobid"
	exit 1
fi

# Keep only the state word itself (empty if squeue printed nothing), so that
# whitespace around it cannot make the comparison below fail.
JOBSTATE=${words[0]-}
if [[ $JOBSTATE != "RUNNING" ]]
then
	echo "The job $jobid is not running, it has state=$JOBSTATE"
	exit 1
fi

# For an array job, get the individual jobid corresponding to the array job:
realjobid=$(squeue -h -O JobID -j "$jobid")
# The squeue man page states that a field given without a size is printed
# 20 characters wide. Remove any such padding so that the value can be
# compared with $jobid and used as a plain job number.
realjobid=${realjobid//[[:space:]]/}

# Print some job information
if [[ $realjobid == "$jobid" ]]
then
	squeue -O JobID:10,Partition:10,NumNodes:6,NumTasks:6,UserName:10,StartTime,TimeUsed:14,TimeLimit:14 -j "$jobid"
else
	# For array jobs
	squeue -O JobID:10,Partition:10,NumNodes:6,NumTasks:6,UserName:10,ArrayJobID:13,ArrayTaskID:14,StartTime,TimeUsed:14,TimeLimit:14 -j "$jobid"
fi
# Print the NodeList
NODELIST=$(squeue -h -O NodeList: -j "$jobid")
# Quoted: a node set such as node[01-04] must not be treated as a glob pattern
echo "NODELIST: $NODELIST"

# Execute parallel shell on the job nodes:
# The "scontrol listpids" lists all processes belonging to a Slurm job $realjobid
# Count also the number of processes (numprocs) and threads (numthreads in $2=nlwp) when $1 is a number

# Pass the node set to clush as one quoted argument: unquoted, a value such as
# node[01-04] is subject to pathname expansion, and an empty value would
# vanish so that clush takes the command string as the node set.
target_nodes=${NODELIST//[[:space:]]/}
if [[ -z $target_nodes ]]
then
	echo "ERROR: No nodes found for job $jobid"
	exit 1
fi

# The command line executed on every node, assembled from three parts.
# \$ is passed on literally, i.e. it is expanded by the remote shell or by awk.
# Part 1: command substitution giving the first column of the listpids output
# from its third line on
job_pids="\$(scontrol listpids $realjobid | tail -n+3 | awk '{print \$1}')"
# Part 2: awk program that echoes every line and prints the totals at the end
count_procs="awk '{print \$0; if(\$1~/^[0-9]+\$/) {numprocs++; numthreads+=\$2}} END {printf(\"Total: %d processes and %d threads\n\", numprocs, numthreads)}'"
# Part 3: ps on those pids, without the lines matching ' root    ', followed by uptime
remote_cmd="$PS $PSFLAGS $job_pids | grep -v ' root    ' | $count_procs; echo -n Uptime:; uptime"

"$CLUSH" -bw "$target_nodes" "$remote_cmd"

# $CLUSH -bw $NODELIST "$PS $PSFLAGS \$(scontrol listpids $realjobid | tail -n+3 | awk '{print \$1}') | grep -v ' root    '" 
# You could also use: $CLUSH -bw@sj:$realjobid ...
